Data Import
Import dataset
# Import the Zeisel dataset
dat <- read.csv("~/Box Sync/Zeisel_preprocessed.csv", row.names = 1)
cell_type <- read.table("~/Box Sync/Zeisel_cell_info.txt", sep = "\t", header = TRUE)
# Convert cell-type names to numeric labels; take the names from the factor
# levels so that cell_label[i] matches numeric cluster label i
cluster_labels <- as.numeric(as.factor(cell_type$level1class))
cell_label <- levels(as.factor(cell_type$level1class))
# Randomly select a pair of gene columns
set.seed(10)
rand_ind <- sample(seq_len(ncol(dat)), size = 2, replace = FALSE)
# Store the selected column indices for later use
ind1 <- rand_ind[1]; ind2 <- rand_ind[2]
sub_dat <- dat[, c(ind1, ind2)]
cell_num <- length(unique(cluster_labels))
# Plot the selected pair of genes, colored by cell type
plot(dat[, ind1], dat[, ind2],
     col = cluster_labels, asp = 1, pch = 16,
     xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
     ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"))
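Since the numeric labels come from as.factor, it is worth confirming that cell_label[i] really names cluster i. A quick cross-tabulation (a sanity-check sketch, not part of the original pipeline) makes the mapping explicit:
# Sanity check: each numeric cluster label should map to exactly one
# cell-type name, with names in alphabetical (factor-level) order
table(cluster_labels, cell_type$level1class)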

Clustering Methods (7C2 Pairwise Combinations)
Method 1 : P-values in Two-sample Tests
t_matrix <- matrix(data = NA, nrow = cell_num, ncol = cell_num)
# Conduct a two-sample t-test for each of the 7C2 = 21 pairs of cell types
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    # Pool both coordinates of each group into one vector for the t-test
    x <- unlist(sub_dat[which(cluster_labels == i), ])
    y <- unlist(sub_dat[which(cluster_labels == j), ])
    ttest_xy <- stats::t.test(x, y, alternative = "two.sided")
    t_matrix[i,j] <- ttest_xy$p.value
  }
}
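To read the 21 p-values at a glance, the matrix can be labeled with the cell-type names; this is an optional convenience step, not part of the original pipeline:
# Label rows and columns so t_matrix prints with cell-type names
rownames(t_matrix) <- cell_label
colnames(t_matrix) <- cell_label
round(t_matrix, 5)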
par(mfrow = c(3,4))
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    tmp <- which(cluster_labels %in% c(i,j))
    sub_dat2 <- sub_dat[tmp, ]
    x <- sub_dat2[, 1]
    y <- sub_dat2[, 2]
    plot(x, y, col = cluster_labels[tmp], asp = 1, cex = 0.7, pch = 16,
         xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
         ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
         main = paste0("T-test p-value: ", round(t_matrix[i, j], 5),
                       "\n", cell_label[i], " and\n", cell_label[j]))
  }
}
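Because 21 tests are run on the same pair of genes, the raw p-values are optimistic under multiple testing. One hedge, not in the original analysis, is to correct the lower-triangle p-values; a Bonferroni correction is assumed below ("BH" is a common alternative):
# Adjust the 21 pairwise p-values for multiple comparisons
raw_p <- t_matrix[lower.tri(t_matrix)]
adj_p <- stats::p.adjust(raw_p, method = "bonferroni")
summary(adj_p)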


Method 2 : Classification Error (logistic regression)
# Calculate classification accuracy for each of the 7C2 pairs
accuracy <- matrix(NA, nrow = cell_num, ncol = cell_num)
sub_dat <- dat[, c(ind1, ind2)]
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    tmp <- which(cluster_labels %in% c(i,j))
    cell_type2 <- cluster_labels[tmp]
    sub_dat2 <- sub_dat[tmp, ]
    dat_df <- data.frame(x1 = sub_dat2[,1], x2 = sub_dat2[,2],
                         label = as.factor(cell_type2))
    # Fit a logistic regression (the intercept is included by default)
    log_res <- stats::glm(label ~ x1 + x2,
                          data = dat_df, family = "binomial")
    predictions <- (stats::predict(log_res,
                                   newdata = dat_df, type = "response") > 0.5)
    conf_table <- table(cell_type2, predictions)
    accuracy[i, j] <- sum(diag(conf_table)) / sum(conf_table)
  }
}
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
These warnings indicate perfect (or near-perfect) separation in some pairs: when two cell types do not overlap in this two-gene space, logistic regression drives the fitted probabilities to 0 or 1. The accuracy estimates remain usable, but the fitted coefficients for those pairs are unstable.
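If one wants to see exactly which pairs are separable, the fits can be re-run with a handler that records the warning. This diagnostic sketch simply repeats the loop above; sep_pairs is a hypothetical helper variable:
# Record which pairs trigger the separation warning, muffling it as we go
sep_pairs <- c()
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    tmp <- which(cluster_labels %in% c(i,j))
    dat_df <- data.frame(x1 = sub_dat[tmp, 1], x2 = sub_dat[tmp, 2],
                         label = as.factor(cluster_labels[tmp]))
    withCallingHandlers(
      stats::glm(label ~ x1 + x2, data = dat_df, family = "binomial"),
      warning = function(w){
        if (grepl("numerically 0 or 1", conditionMessage(w)))
          sep_pairs <<- c(sep_pairs, paste(cell_label[i], "vs", cell_label[j]))
        invokeRestart("muffleWarning")
      }
    )
  }
}
sep_pairs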
par(mfrow = c(3,4))
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    tmp <- which(cluster_labels %in% c(i,j))
    sub_dat2 <- sub_dat[tmp, ]
    x <- sub_dat2[, 1]
    y <- sub_dat2[, 2]
    plot(x, y, col = cluster_labels[tmp], asp = 1, cex = 0.7, pch = 16,
         xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
         ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
         main = paste0("Accuracy: ", round(accuracy[i, j], 3),
                       "\n", cell_label[i], " and\n", cell_label[j]))
  }
}


Method 3 : Clustering Objective Compared to Random Assignment (K-means)
.l2norm <- function(x){sqrt(sum(x^2))}
# Within-cluster dispersion: sum of L2 distances from each point to its
# cluster centroid (note: unsquared distances, unlike the k-means objective)
compute_ss <- function(dat, cluster_label){
  stopifnot(nrow(dat) == length(cluster_label))
  uniq_val <- unique(cluster_label)
  # Centroid of each cluster
  mean_list <- lapply(uniq_val, function(x){
    colMeans(dat[which(cluster_label == x), ])
  })
  sum(sapply(1:nrow(dat), function(i){
    .l2norm(dat[i,] - mean_list[[which(uniq_val == cluster_label[i])]])
  }))
}
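One caveat: despite the name, compute_ss sums unsquared L2 distances, whereas the k-means objective referenced in the heading minimizes squared distances. A variant matching the k-means objective would look like this (compute_wss is a hypothetical helper, not used below):
# Within-cluster sum of *squared* distances, the quantity stats::kmeans()
# actually minimizes (compare with compute_ss, which sums unsquared norms)
compute_wss <- function(dat, cluster_label){
  sum(sapply(unique(cluster_label), function(k){
    idx <- which(cluster_label == k)
    centered <- sweep(as.matrix(dat[idx, , drop = FALSE]), 2,
                      colMeans(dat[idx, , drop = FALSE]))
    sum(centered^2)
  }))
}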
par(mfrow = c(3,2))
for (i in 2:cell_num){
  for (j in 1:(i-1)){
    tmp <- which(cluster_labels %in% c(i,j))
    cell_type2 <- cluster_labels[tmp]
    sub_dat2 <- sub_dat[tmp, ]
    set.seed(10)
    # Random baseline: assign each cell to one of two clusters uniformly
    new_label <- sample(1:2, size = nrow(sub_dat2), replace = TRUE)
    real_ss <- compute_ss(sub_dat2, cell_type2)
    random_ss <- compute_ss(sub_dat2, new_label)
    plot(sub_dat2[,1], sub_dat2[,2], cex = 0.7,
         asp = 1, col = as.numeric(as.factor(cell_type2)), pch = 16,
         xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
         ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
         main = paste0("SS: ", round(real_ss, 2),
                       "\n", cell_label[i], " and\n", cell_label[j]))
    plot(sub_dat2[,1], sub_dat2[,2],
         asp = 1, col = new_label, pch = 16, cex = 0.7,
         xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
         ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
         main = paste0("SS: ", round(random_ss, 2), "\nwith Random Assignment",
                       "\nDifference: ", round(abs(random_ss - real_ss), 2)))
  }
}


Clustering Methods (1 vs. 6 Cell Types)
Method 1 : P-values in Two-sample Tests
sub_dat <- dat[, c(ind1, ind2)]
pval_vec <- c()
# Conduct a two-sample t-test for each cell type against the remaining six
for (i in 1:cell_num){
  # Pool both coordinates of each group into one vector for the t-test
  x <- unlist(sub_dat[which(cluster_labels == i), ])
  y <- unlist(sub_dat[which(cluster_labels != i), ])
  ttest_xy <- stats::t.test(x, y, alternative = "two.sided")
  pval_vec <- c(pval_vec, ttest_xy$p.value)
}
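Attaching the cell-type names makes the seven p-values easier to scan (an optional convenience, not in the original pipeline):
# Name and display the one-vs-rest p-values
names(pval_vec) <- cell_label
round(pval_vec, 5)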
par(mfrow = c(2,4))
for (i in 1:cell_num){
  x <- sub_dat[which(cluster_labels == i), ]
  plot(sub_dat, asp = 1, cex = 0.7, pch = 16,
       xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
       ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
       main = paste0(cell_label[i], "\nand the rest\n",
                     "T-test p-value: ", round(pval_vec[i], 5)))
  # Highlight the cells of type i in the color matching cluster label i
  points(x, col = i, pch = 16)
}

Method 2 : Classification Error (logistic regression)
# Calculate classification accuracy for each one-vs-rest split
accuracy_vec <- c()
for (i in 1:cell_num){
  binary_cell <- ifelse(cluster_labels == i, 1, 2)
  dat_df <- data.frame(x1 = sub_dat[,1], x2 = sub_dat[,2],
                       label = as.factor(binary_cell))
  log_res <- stats::glm(label ~ x1 + x2,
                        data = dat_df, family = "binomial")
  predictions <- (stats::predict(log_res,
                                 newdata = dat_df, type = "response") > 0.5)
  conf_table <- table(binary_cell, predictions)
  accuracy_vec <- c(accuracy_vec,
                    sum(diag(conf_table)) / sum(conf_table))
}
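A caveat for the one-vs-rest splits: the classes are imbalanced, so accuracy should be read against the majority-class baseline. A short check, not in the original analysis (baseline is a hypothetical helper variable):
# Baseline accuracy from always predicting the majority class ("the rest");
# a useful reference point because the one-vs-rest splits are imbalanced
baseline <- sapply(1:cell_num, function(i){
  max(table(ifelse(cluster_labels == i, 1, 2))) / length(cluster_labels)
})
round(rbind(accuracy = accuracy_vec, baseline = baseline), 3)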
par(mfrow = c(2,4))
for (i in 1:cell_num){
  x <- sub_dat[which(cluster_labels == i), ]
  plot(sub_dat, asp = 1, cex = 0.7, pch = 16,
       xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
       ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
       main = paste0(cell_label[i], " and the rest\n",
                     "Accuracy: ", round(accuracy_vec[i], 3)))
  points(x, col = i, pch = 16)
}

Method 3 : Clustering Objective Compared to Random Assignment (K-means)
par(mfrow = c(3,2))
for (i in 1:cell_num){
  binary_cell <- ifelse(cluster_labels == i, 1, 2)
  set.seed(10)
  # Random baseline: assign each cell to one of two clusters uniformly
  new_label <- sample(1:2, size = nrow(sub_dat), replace = TRUE)
  real_ss <- compute_ss(sub_dat, binary_cell)
  random_ss <- compute_ss(sub_dat, new_label)
  plot(sub_dat[,1], sub_dat[,2], cex = 0.7,
       asp = 1, col = binary_cell, pch = 16,
       xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
       ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
       main = paste0(cell_label[i], " and the rest\n",
                     "SS: ", round(real_ss, 2)))
  plot(sub_dat[,1], sub_dat[,2],
       asp = 1, col = new_label, pch = 16, cex = 0.7,
       xlab = paste0(colnames(dat)[ind1], ", (", ind1, ")"),
       ylab = paste0(colnames(dat)[ind2], ", (", ind2, ")"),
       main = paste0("Random Assignment\n", "SS: ", round(random_ss, 2),
                     "\nDifference: ", round(abs(random_ss - real_ss), 2)))
}
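Finally, since the heading invokes k-means but the code above never calls it, a quick comparison with stats::kmeans on the same two columns may be informative. Note that kmeans reports tot.withinss in squared distances, so it is not directly comparable to compute_ss:
# For reference: run k-means with k = 2 on the same two genes and compare
# its cluster assignment with the true cell-type labels
set.seed(10)
km <- stats::kmeans(sub_dat, centers = 2, nstart = 10)
km$tot.withinss
table(km$cluster, cluster_labels)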


